The Science of Startups: The Impact of Founder Personalities on Company Success — 2024-01-16, Xian Gong
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from matplotlib.collections import PolyCollection
from matplotlib.colors import to_rgb
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import plotly
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from sklearn.preprocessing import scale
import scipy
from random import random
import pandas.testing as tm
from pyclustertend import hopkins
import scipy.cluster.hierarchy as sch
from sklearn.manifold import TSNE
import time
from sklearn import datasets
from palmerpenguins import load_penguins
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from validclust import dunn
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
import statsmodels.api as sm
# Figure Quality
# Render matplotlib figures inline in the notebook and raise the default
# resolution so exported figures are publication quality.
%matplotlib inline
plt.rcParams['figure.dpi'] = 300
The full dataset takes a long time to process, so a smaller random sample is used here.
# Load the data
# Anonymized entrepreneur/employee personality percentile scores; the full
# (non-anonymized) file is '11085_Entrepreneurs_and_Employee_Percentile_Scores.csv'.
df_input = pd.read_csv('Entrepreneurs_and_Employee_Scores_Anon.csv')
df_input = df_input.set_index('User')
# Subsample to keep runtime manageable. Seed the sample so Restart & Run All
# reproduces the same rows (the original unseeded sample made every run differ).
df_input = df_input.sample(500, random_state=42)
# Train Test Split: every column but the last is a feature; the last column
# ('Target') is the label.
X_train, X_test, y_train, y_test = train_test_split(df_input.iloc[:,:-1], df_input.iloc[:,[-1]], test_size=0.3, random_state=42)
# Tuning hyperparameters of SVM
# Three log-spaced candidates (0.1, 1, 10) for the regularization parameter C.
C_range = np.logspace(-1, 1, 3)
print(f'The list of values for C are {C_range}')
# The same log-spaced grid for gamma, the kernel coefficient.
gamma_range = np.logspace(-1, 1, 3)
print(f'The list of values for gamma are {gamma_range}')
The list of values for C are [ 0.1 1. 10. ] The list of values for gamma are [ 0.1 1. 10. ]
The following code takes very long to run:
# SVM Classifier
# Define the search space
param_grid = {
    # Regularization parameter.
    "C": C_range,
    # Kernel type.
    "kernel": ['rbf', 'poly'],
    # Gamma is the kernel coefficient for 'rbf', 'poly' and 'sigmoid';
    # also try sklearn's built-in 'scale' and 'auto' heuristics.
    "gamma": gamma_range.tolist() + ['scale', 'auto']
}
# Model selection and refitting are driven by ROC AUC.
scoring = ['roc_auc']
# Set up the k-fold cross-validation (2 folds keeps runtime manageable).
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
svc = SVC(probability=True)
# Define grid search
grid_search = GridSearchCV(estimator=svc,
                           param_grid=param_grid,
                           scoring=scoring,
                           refit='roc_auc',
                           n_jobs=-1,
                           cv=kfold,
                           verbose=0)
# Fit grid search. Ravel the single-column label frame into a 1-D array to
# avoid sklearn's DataConversionWarning.
grid_result = grid_search.fit(X_train, np.ravel(y_train))
# Model Performance. Note: best_score_ and .score() report the refit metric,
# which is ROC AUC here — the original printout mislabeled them "accuracy".
print(f'The best ROC AUC score for the training dataset is {grid_result.best_score_:.4f}')
print(f'The best hyperparameters are {grid_result.best_params_}')
print(f'The ROC AUC score for the testing dataset is {grid_search.score(X_test, np.ravel(y_test)):.4f}')
# Test set Performance
# Evaluate the refit best estimator on the held-out 30% split.
svm_best_model = grid_result.best_estimator_
y_pred_svm = svm_best_model.predict(X_test)
y_pred_log_proba_svm = svm_best_model.predict_log_proba(X_test)
y_pred_proba_svm = svm_best_model.predict_proba(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')
# AUC uses the predicted probability of the positive class (column 1).
auc_svm = roc_auc_score(y_test, y_pred_proba_svm[:, 1])
CM_svm = confusion_matrix(y_test, y_pred_svm)
# Unpack the 2x2 confusion matrix: rows are true labels, columns predicted.
TN_svm, FP_svm = CM_svm[0][0], CM_svm[0][1]
FN_svm, TP_svm = CM_svm[1][0], CM_svm[1][1]
specificity_svm = round(TN_svm / (TN_svm + FP_svm), 4)
ppv_svm = round(TP_svm / (TP_svm + FP_svm), 4)
print(f"Accuracy score: {accuracy_svm}")
print(f"F1 score: {f1_svm}")
print(f"Specificity: {specificity_svm}")
print(f"Positive Predictive Value: {ppv_svm}")
print(f"AUC score: {auc_svm}")
# Figure 1A
# Row-normalized confusion matrix heatmap: each row (true label) sums to 1.
# BUG FIX: the original divided by CM_svm.sum(axis=1) without keepdims, which
# numpy broadcasts across COLUMNS — element [i, j] was divided by row j's sum
# instead of row i's. keepdims=True makes the division per-row as intended.
data = (CM_svm / CM_svm.sum(axis=1, keepdims=True)).tolist()
fig = px.imshow(data,
                labels=dict(x="<b>Predicted Label</b>", y="<b>True Label</b>", color="Percent", fontsize=14),
                x=['Employee', 'Entrepreneur'],
                y=['Employee', 'Entrepreneur'],
                color_continuous_scale=px.colors.sequential.Greens,
                template='simple_white',
                text_auto='.2f')
# Put the predicted-label axis on top, matching confusion-matrix convention.
fig.update_xaxes(side="top")
fig.update_coloraxes(colorbar_thickness=30,
                     colorbar_tickfont=dict(size=20),
                     colorbar_len=0.6,
                     colorbar_ticklabelposition='outside top')
fig.update_layout(margin=dict(t=200, r=200, b=200, l=200),
                  xaxis=dict(linecolor='white'),
                  yaxis=dict(linecolor='white'),
                  showlegend=False,
                  width=700, height=700,
                  autosize=False)
# Base font size for all figure text.
fig.update_layout(font=dict(size=14))
# Tight final margins (overrides the large margins set above).
fig.update_layout(margin=dict(r=0, b=0, t=0, l=0, pad=20))
fig.show()
# fig.write_image("outputFiles/Figure_2a.pdf")
# Load Dataset
df_facet_compare = pd.read_csv('Entrepreneurs_and_Employee_Scores_Anon.csv')
df_facet_compare = df_facet_compare.set_index('User')
# Reshape to long format: one row per (Target, personality facet, value).
df_compare_graph = pd.DataFrame(df_facet_compare.set_index('Target').stack()).reset_index()
df_compare_graph.columns = ['Target', 'Personality', 'Value']


def _facet_label(name):
    """Turn a column name like 'Openness_facet_adventurousness_percentile'
    into a display label like 'Openness (adventurousness)'."""
    return (name.replace('_facet_', ' (')
                .replace('_percentile', ')')
                .replace('Emotional range', 'Emotional Stability')
                .replace('_', '-'))


df_compare_graph['Personality'] = df_compare_graph['Personality'].apply(_facet_label)
# Manipulate the dataset for the figure: keep the six facets with the
# largest effect sizes, stacked in a fixed display order.
facet_display_order = ['Openness (adventurousness)',
                       'Agreeableness (modesty)',
                       'Extraversion (activity-level)',
                       'Emotional Stability (anxiety)',
                       'Emotional Stability (immoderation)',
                       'Agreeableness (trust)']
df_sig_comper = pd.concat([df_compare_graph[df_compare_graph['Personality'] == f]
                           for f in facet_display_order])
# Figure 1B
# Split violin plots comparing entrepreneurs vs. employees on the six
# selected personality facets.
sns.set(font_scale = 4)
sns.set_style("whitegrid", {'legend.frameon':False})
plt.figure(figsize=(20,35))
ax = sns.violinplot(x="Value", y="Personality", hue="Target",
                    data=df_sig_comper,
                    palette=['.4', '.8'],
                    split=True, linewidth = 2)
# One (dark, light) color pair per facet row, applied below by walking the
# violin PolyCollections in draw order.
colors = [sns.color_palette("Paired")[1], sns.color_palette("Paired")[0],
          sns.color_palette("Greens")[4], sns.color_palette("Greens")[2],
          sns.color_palette("Paired")[7], sns.color_palette("Paired")[6],
          sns.color_palette("Paired")[9], sns.color_palette("Paired")[8],
          sns.color_palette("Paired")[9], sns.color_palette("Paired")[8],
          sns.color_palette("Greens")[4], sns.color_palette("Greens")[2]]
# NOTE(review): this assumes ax.findobj(PolyCollection) yields the violins
# in the same order as `colors` — fragile if seaborn changes artist order;
# verify visually after any seaborn upgrade.
for ind, violin in enumerate(ax.findobj(PolyCollection)):
    rgb = to_rgb(colors[ind])
    violin.set_facecolor(rgb)
ax.tick_params(axis='x', colors='black')
ax.tick_params(axis='y', colors='black')
sns.set(font_scale = 4)
# Soften all fills so the split halves remain distinguishable.
plt.setp(ax.collections, alpha=.6)
ax.set(xlabel="Percentile Score of Personality Facet")
ax.set(ylabel=None)
ax.xaxis.set_label_coords(.5, -.06)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), facecolor = 'white')
sns.despine(left=True,bottom = True)
plt.margins(x=0, y=0)
fig = ax.get_figure()
# Load and Clean Dataset
# Anonymized founder/cofounder Big-5 facet matrix (raw scores).
df_facet_matrix = pd.read_excel('Founders_Cofounders_Twitter_Personality_Matrix_Anon_diff.xlsx')
# Keep an independent snapshot of the raw frame; it is reused later for the
# percentile comparison. .copy() replaces the original's no-op aliasing
# (raw = m; m = raw), which left both names pointing at the same object.
df_facet_matrix_raw = df_facet_matrix.copy()
# Keep only the per-facet raw-score columns, plus the User id column.
# (Parentheses make the & / | precedence explicit.)
facet_raw_cols = (df_facet_matrix.columns.str.contains('facet')
                  & df_facet_matrix.columns.str.contains('raw_score'))
df_facet_matrix_filter = df_facet_matrix.loc[:, facet_raw_cols | df_facet_matrix.columns.str.contains('User')]
print(df_facet_matrix_filter.shape)
df_facet_matrix_filter = df_facet_matrix_filter.drop_duplicates('User')
print(df_facet_matrix_filter.shape)
df_facet_matrix_filter = df_facet_matrix_filter.set_index('User')
print(df_facet_matrix_filter.shape)
# Drop users with no personality scores (a missing adventurousness score is
# used as the proxy for "profile entirely missing").
df_facet_matrix_filter = df_facet_matrix_filter[~df_facet_matrix_filter['Openness_facet_adventurousness_raw_score'].isna()]
print(df_facet_matrix_filter.shape)
(42266, 31) (40580, 31) (40580, 30) (32732, 30)
# Subsample for tractable t-SNE runtime. Seed the sample so the embedding
# and all downstream clustering are reproducible across kernel restarts
# (the original unseeded sample defeated the purpose of fixing RS below).
df_facet_matrix_filter = df_facet_matrix_filter.sample(1000, random_state=123)
# Apply t-SNE
RS = 123
tsne = TSNE(random_state=RS).fit_transform(df_facet_matrix_filter)
# Hopkins statistic of the 2-D embedding as a cluster-tendency check.
hopkins(tsne, tsne.shape[0])
The following code takes very long to run:
# Dendrogram
# Ward-linkage hierarchical clustering over the t-SNE embedding; the
# dendrogram guides the choice of cluster count below.
sns.set(font_scale = 1)
linkage_matrix = sch.linkage(tsne, method="ward")
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Dendrogram')
plt.xlabel('Founders')
plt.ylabel('Euclidean distances')
plt.show()
The following code takes very long to run:
# Number of Cluster Selection
# Evaluate agglomerative clusterings for k = 2..30 with four quality metrics.
num_cluster_range = list(range(2, 31))
# The pairwise-distance matrix (needed by the Dunn index) does not depend on
# k, so compute it once — the original recomputed it on every iteration.
dist = pairwise_distances(tsne)
# Davies Bouldin Index - the smaller the better
db_score_list = []
# Silhouette Coefficient - the larger the better
sil_score_list = []
# Calinski Harabasz Index - the larger the better
ch_score_list = []
# Dunn Index - the larger the better
dunn_score_list = []
# One fit per k feeds all four metrics; the original ran four separate
# loops that each re-fit the identical clustering.
for k in num_cluster_range:
    hc_test = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
    y_hc_test = hc_test.fit_predict(tsne)
    db_score_list.append(davies_bouldin_score(tsne, y_hc_test))
    sil_score_list.append(silhouette_score(tsne, y_hc_test))
    ch_score_list.append(calinski_harabasz_score(tsne, y_hc_test))
    dunn_score_list.append(dunn(dist, y_hc_test))
pio.templates.default = "plotly_white"
# One line per clustering-quality metric, all scaled onto a common axis.
metric_specs = [
    ('Davies Bouldin Index', db_score_list, '#d62728', None),
    ('Silhouette Coefficient', sil_score_list, '#1f77b4', 'dash'),
    ('Calinski Harabas Index', ch_score_list, '#ff7f0e', 'dashdot'),
    ('Dunn Index', dunn_score_list, '#9467bd', 'dot'),
]
fig = go.Figure()
# Vertical guide at the chosen number of clusters (k = 6).
fig.add_vline(x=6, line_width=1, line_dash="dash", line_color="grey")
for metric_name, scores, color, dash in metric_specs:
    fig.add_trace(go.Scatter(x=num_cluster_range, y=scale(scores),
                             mode='lines', name=metric_name,
                             line=dict(color=color, width=2.5, dash=dash)))
fig.update_layout(
    xaxis_title='Number of Clusters',
    yaxis_title='Scaled Scores')
fig.show()
# Hierarchical Clustering
# Final model with the k = 6 selected from the metric comparison above.
hc = AgglomerativeClustering(n_clusters=6, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(tsne)
# Attach each founder's cluster id to the sampled facet matrix.
df_6clusters = df_facet_matrix_filter.copy()
df_6clusters['Hierarchical_label'] = y_hc
# Figure 1C
# t-SNE scatter of the six founder clusters, with a persona-name annotation
# placed at each cluster's centroid.
clustering_colors = ['rgb(230,245,152)','rgb(252,141,89)','rgb(50,136,189)','rgb(213,62,79)','rgb(153,213,148)','rgb(254,224,139)']
df_clustering = pd.concat([df_6clusters.reset_index(), pd.DataFrame(tsne, columns=['x', 'y'])], axis=1)
# Remap raw cluster ids to presentation order (0->0, 1->3, 2->5, 3->4,
# 4->1, 5->2) via temporary digit codes so the substitutions cannot collide.
df_clustering['Hierarchical_label'] = df_clustering['Hierarchical_label'].apply(lambda x: str(x).replace('1','6').replace('2','7').replace('3','8').replace('4','9').replace('5','-1').replace('9','1').replace('-1','2').replace('6','3').replace('8','4').replace('7','5'))
df_clustering['Hierarchical_label'] = df_clustering['Hierarchical_label'].astype(int)
df_clustering = df_clustering[['x', 'y', 'Hierarchical_label']].copy()
# Human-readable persona names for the reordered cluster ids.
df_clustering['Hierarchical_label'] = df_clustering['Hierarchical_label'].apply(lambda x: str(x).replace('0','Accomplisher').replace('1','Engineer').replace('2','Leader').replace('3','Developer').replace('4','Operator').replace('5','Fighter'))
df_clustering = df_clustering.sort_values('Hierarchical_label')
# Cluster centroids, used to place the annotations.
df_textbox = df_clustering.groupby('Hierarchical_label')[['x', 'y']].mean().reset_index()
fig = px.scatter(df_clustering, x="x", y="y", color="Hierarchical_label",
                 hover_data=['Hierarchical_label'], template="plotly_white",
                 color_discrete_sequence=clustering_colors,
                 opacity=0.2, width=1000, height=800)
fig.update_layout({
    'plot_bgcolor': 'rgba(0,0,0,0)',
    'paper_bgcolor': 'rgba(0,0,0,0)'
})
fig.update_layout(yaxis=dict(title=''),
                  xaxis=dict(title=''))
fig.update_layout(showlegend=False)
fig.update_xaxes(showline=True, linecolor='grey', linewidth=1, row=1, col=1, mirror=True)
fig.update_yaxes(showline=True, linecolor='grey', linewidth=1, row=1, col=1, mirror=True)
fig.update_xaxes(title="t-SNE dimension 1", title_font=dict(size=16))
fig.update_yaxes(title="t-SNE dimension 2", title_font=dict(size=16))
for index, text, x, y in df_textbox.itertuples(index=True):
    # Every cluster name gets the same bold markup. The original branched
    # on the name here, but all three branches produced the identical
    # string, so the dead conditional is collapsed.
    text_edit = '<b>' + text + '</b>'
    # Colored halo behind the label (empty text with a colored border).
    fig.add_annotation(
        x=x, y=y,
        text=' <br> ',
        showarrow=False,
        xref="x",
        yref="y",
        font=dict(
            size=14,
            color='black'
        ),
        align="center",
        bordercolor=clustering_colors[index],
        borderwidth=0,
        borderpad=8,
        opacity=0.4
    )
    # The persona label itself.
    fig.add_annotation(
        x=x, y=y,
        text=text_edit,
        showarrow=False,
        xref="x",
        yref="y",
        font=dict(
            size=18,
            color='black'
        ),
        align="center",
        opacity=1
    )
fig.update_layout(
    margin=dict(r=50, b=0, t=50, l=0, pad=20)
)
fig.show()
# fig.write_image("outputFiles/Figure_2c.svg")
# Percentile-score version of the facet matrix, for the cluster heatmap.
df_facet_percentile_compare = df_facet_matrix_raw
# Load and Clean Dataset
#df_facet_percentile_compare = pd.read_excel('Founders and Cofounders with Twitter (40k) - Personality Matrix.xlsx')
# Keep facet percentile columns plus the User id (& binds tighter than |,
# so this is (facet & percentile) | User).
df_facet_percentile_compare_filter = df_facet_percentile_compare.loc[:,df_facet_percentile_compare.columns.str.contains('facet') & df_facet_percentile_compare.columns.str.contains('percentile') | df_facet_percentile_compare.columns.str.contains('User')]
print (df_facet_percentile_compare_filter.shape)
df_facet_percentile_compare_filter = df_facet_percentile_compare_filter.drop_duplicates('User')
print (df_facet_percentile_compare_filter.shape)
df_facet_percentile_compare_filter = df_facet_percentile_compare_filter.set_index('User')
print (df_facet_percentile_compare_filter.shape)
# Drop users with no personality scores (missing adventurousness is used as
# the proxy for an entirely missing profile).
df_facet_percentile_compare_filter = df_facet_percentile_compare_filter[~df_facet_percentile_compare_filter['Openness_facet_adventurousness_percentile'].isna()]
print (df_facet_percentile_compare_filter.shape)
# Rebuild the per-user cluster labels with the same two-phase digit remap
# used for Figure 1C (temporary codes avoid substitution collisions).
df_clustering = pd.concat([df_6clusters.reset_index(), pd.DataFrame(tsne, columns = ['x','y'])], axis = 1)
df_clustering['Hierarchical_label'] = df_clustering['Hierarchical_label'].apply(lambda x: str(x).replace('1','6').replace('2','7').replace('3','8').replace('4','9').replace('5','-1').replace('9','1').replace('-1','2').replace('6','3').replace('8','4').replace('7','5'))
df_clustering['Hierarchical_label'] = df_clustering['Hierarchical_label'].astype(int)
df_clustering = df_clustering[['User','Hierarchical_label']].copy()
df_clustering['Hierarchical_label'] = df_clustering['Hierarchical_label'].apply(lambda x: str(x).replace('0','Accomplisher').replace('1','Engineer').replace('2','Leader').replace('3','Developer').replace('4','Operator').replace('5','Fighter'))
# Left-merge on the clustered users, so only the sampled/clustered users
# carry into the heatmap.
df_facet_percentile_compare_filter = df_clustering.merge(df_facet_percentile_compare_filter, how = 'left', left_on = 'User', right_on = 'User')
df_facet_percentile_compare_filter = df_facet_percentile_compare_filter.set_index('User')
# Median percentile per cluster; transpose so facets become rows.
df_heatmap_median = df_facet_percentile_compare_filter.groupby('Hierarchical_label').median().T
# Reformat row labels: 'Openness_facet_adventurousness_percentile' ->
# 'Openness (adventurousness)'.
df_heatmap_median.index = ['{0} ({1})'.format(i.replace('_percentile','').replace('_facet','').replace('Emotional range','Emotional Stability').split('_')[0],'-'.join(i.replace('_percentile','').replace('_facet','').replace('Emotional range','Emotional Stability').split('_')[1:])) for i in df_heatmap_median.index.tolist()]
# Figure 1D
# Clustered heatmap of each cluster's median facet percentiles.
im = sns.clustermap(df_heatmap_median, cmap='Greens', linewidth = 1)
ax = im.ax_heatmap
ax.set_xlabel("")
# NOTE(review): these labels are hard-coded to match the column order that
# the clustermap's dendrogram happens to produce. If the data, sample, or
# seed changes, the dendrogram order may change and these labels would be
# wrong — verify against im.data2d.columns before trusting the figure.
labels = ['Engineer','Fighter','Developer','Operator','Accomplisher','Leader']
ax.set_xticklabels(labels, rotation=90, ha='center')
plt.show()
fig = ax.get_figure()
# Extended Data Figure 4
# Load Dataset: anonymized entrepreneur vs. employee percentile scores,
# indexed by user id.
df_facet_compare_entrepreneur = (
    pd.read_csv('Entrepreneurs_and_Employee_Scores_Anon.csv')
      .set_index('User')
)
#measure cohens d function
def cohen_d(x, y):
    """Cohen's d effect size between samples x and y.

    Computed as the difference of means divided by the pooled
    (Bessel-corrected) standard deviation.
    """
    nx, ny = len(x), len(y)
    dof = nx + ny - 2
    pooled_var = ((nx - 1) * np.std(x, ddof=1) ** 2
                  + (ny - 1) * np.std(y, ddof=1) ** 2) / dof
    return (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)
# Organize the results
# For each of the 30 facet columns, compare entrepreneurs vs. employees:
# Cohen's d effect size plus Welch's (unequal-variance) t-test p-value.
facet_test_list = []
for facet in df_facet_compare_entrepreneur.columns.tolist()[:30]:
    boss_sample = df_facet_compare_entrepreneur[df_facet_compare_entrepreneur['Target'] == 'Entrepreneur'][facet].tolist()
    employee_sample = df_facet_compare_entrepreneur[df_facet_compare_entrepreneur['Target'] == 'Employee'][facet].tolist()
    res = scipy.stats.ttest_ind(boss_sample, employee_sample, equal_var=False)
    facet_test_list.append({
        'Big 5 Personality Facets': facet,
        "Cohen's D": cohen_d(boss_sample, employee_sample),
        'p Value': res.pvalue,
    })
df_facets_cohen_d = pd.DataFrame(facet_test_list)
df_facets_cohen_d['abs_cohen_d'] = df_facets_cohen_d["Cohen's D"].abs()
# (The original also called sort_values here without assigning the result —
# a no-op, removed; the effective sort happens below.)
# Bucket |d| into conventional effect-size categories.
df_facets_cohen_d['Effect Size'] = np.nan
df_facets_cohen_d.loc[df_facets_cohen_d['abs_cohen_d'] <= 0.2, 'Effect Size'] = 'Trivial'
df_facets_cohen_d.loc[(df_facets_cohen_d['abs_cohen_d'] > 0.2) & (df_facets_cohen_d['abs_cohen_d'] <= 0.4), 'Effect Size'] = 'Small'
df_facets_cohen_d.loc[(df_facets_cohen_d['abs_cohen_d'] > 0.4) & (df_facets_cohen_d['abs_cohen_d'] <= 0.65), 'Effect Size'] = 'Medium'
df_facets_cohen_d.loc[df_facets_cohen_d['abs_cohen_d'] > 0.65, 'Effect Size'] = 'Large'
df_facets_cohen_d = df_facets_cohen_d.sort_values('abs_cohen_d', ascending=False)
df_facets_cohen_d = df_facets_cohen_d[['Big 5 Personality Facets', "Cohen's D", 'p Value', 'Effect Size']]
df_facets_cohen_d
| Big 5 Personality Facets | Cohen's D | p Value | Effect Size | |
|---|---|---|---|---|
| 1 | Openness_facet_adventurousness_percentile | 0.920176 | 0.000000e+00 | Large |
| 21 | Agreeableness_facet_modesty_percentile | -0.792602 | 0.000000e+00 | Large |
| 13 | Extraversion_facet_activity_level_percentile | 0.776164 | 0.000000e+00 | Large |
| 26 | Emotional range_facet_anxiety_percentile | -0.770742 | 0.000000e+00 | Large |
| 28 | Emotional range_facet_immoderation_percentile | -0.734884 | 0.000000e+00 | Large |
| 24 | Agreeableness_facet_trust_percentile | 0.720660 | 0.000000e+00 | Large |
| 25 | Emotional range_facet_anger_percentile | -0.679333 | 8.705857e-279 | Large |
| 27 | Emotional range_facet_depression_percentile | -0.677018 | 1.784581e-262 | Large |
| 20 | Agreeableness_facet_cooperation_percentile | 0.669759 | 1.558560e-285 | Large |
| 3 | Openness_facet_emotionality_percentile | -0.666483 | 7.568027e-281 | Large |
| 7 | Conscientiousness_facet_achievement_striving_p... | 0.587171 | 1.032597e-224 | Medium |
| 11 | Conscientiousness_facet_self_discipline_percen... | 0.548171 | 4.828852e-186 | Medium |
| 8 | Conscientiousness_facet_cautiousness_percentile | 0.503654 | 2.582571e-168 | Medium |
| 5 | Openness_facet_intellect_percentile | 0.446235 | 6.599245e-133 | Medium |
| 12 | Conscientiousness_facet_self_efficacy_percentile | 0.433243 | 1.990471e-125 | Medium |
| 14 | Extraversion_facet_assertiveness_percentile | 0.431710 | 4.392563e-122 | Medium |
| 6 | Openness_facet_liberalism_percentile | 0.426033 | 1.113057e-119 | Medium |
| 23 | Agreeableness_facet_sympathy_percentile | -0.425556 | 2.378425e-108 | Medium |
| 9 | Conscientiousness_facet_dutifulness_percentile | 0.412815 | 2.876272e-110 | Medium |
| 10 | Conscientiousness_facet_orderliness_percentile | 0.308178 | 2.537543e-63 | Small |
| 17 | Extraversion_facet_friendliness_percentile | 0.236295 | 1.677218e-34 | Small |
| 16 | Extraversion_facet_excitement_seeking_percentile | -0.214609 | 1.261846e-30 | Small |
| 29 | Emotional range_facet_self_consciousness_perce... | -0.163301 | 9.374383e-18 | Trivial |
| 4 | Openness_facet_imagination_percentile | -0.152506 | 1.906209e-16 | Trivial |
| 18 | Extraversion_facet_gregariousness_percentile | 0.145971 | 2.537291e-14 | Trivial |
| 22 | Agreeableness_facet_morality_percentile | 0.122673 | 3.144615e-11 | Trivial |
| 15 | Extraversion_facet_cheerfulness_percentile | -0.082933 | 1.014100e-05 | Trivial |
| 2 | Openness_facet_artistic_interests_percentile | 0.044849 | 1.455970e-02 | Trivial |
| 19 | Agreeableness_facet_altruism_percentile | -0.044765 | 1.793611e-02 | Trivial |
| 0 | Unnamed: 0 | -0.018811 | 3.314753e-01 | Trivial |
# Extended Data Figure 5
# Clustered heatmap of median facet percentiles per Target group
# (columns), over the last 30 columns of the comparison frame.
df_entrepreneurship_heatmap = df_facet_compare_entrepreneur.groupby('Target').median().T.tail(30)
im = sns.clustermap(df_entrepreneurship_heatmap, cmap='Greens')
plt.show()
# Extended Data Figure 6
# Dictionary mapping dataset name -> Hopkins statistic, used to benchmark
# the founders' cluster tendency against reference datasets.
hopkins_score_dict = {}
# Car Dataset (public EDA dataset used as a benchmark)
df_car = pd.read_csv('https://raw.githubusercontent.com/Akankhya123/EDA-on-Car-Features/master/data.csv')
df_car = df_car.drop(['Engine Fuel Type', 'Market Category', 'Number of Doors'], axis=1)
df_car = df_car.rename(columns={"Engine HP": "HP", "Engine Cylinders": "Cylinders", "Transmission Type": "Transmission", "Driven_Wheels": "Drive Mode", "highway MPG": "MPG-H", "city mpg": "MPG-C", "MSRP": "Price"})
df_car = df_car.drop_duplicates().dropna()
# Keep numeric features only for the Hopkins computation.
df_car = df_car[['Year', 'HP', 'Cylinders', 'MPG-H', 'MPG-C', 'Popularity']]
# 1 - hopkins(...) matches the convention used for all datasets below.
hopkins_score_dict['Car Features'] = 1 - hopkins(df_car.values, df_car.shape[0])
# Athletes Dataset (Rio 2016)
df_athletes = pd.read_csv('https://raw.githubusercontent.com/flother/rio2016/master/athletes.csv')
df_events = pd.read_csv('https://raw.githubusercontent.com/flother/rio2016/master/events.csv')
# Encode sex as an integer: 'female' -> 1, 'male' -> 0 (the 'female'
# substitution must run first since 'female' contains 'male').
df_athletes['sex'] = df_athletes['sex'].apply(
    lambda s: int(s.replace('female', '1').replace('male', '0')))
df_athletes = df_athletes[['height', 'weight', 'gold', 'silver', 'bronze', 'sex']]
# Drop athletes missing either body measurement.
df_athletes = df_athletes.dropna(subset=['weight', 'height'])
hopkins_score_dict['Olympic Athletes'] = 1 - hopkins(df_athletes.values, df_athletes.shape[0])
# t-SNE embedding of the founders (2 dimensions)
hopkins_score_dict['2-dimensions'] = 1 - hopkins(tsne, tsne.shape[0])
# Founders in the full 30-dimensional facet space
hopkins_score_dict['Founders (32k)'] = 1 - hopkins(df_facet_matrix_filter, df_facet_matrix_filter.shape[0])
# Iris Dataset (classic clusterable benchmark), standardized
df_iris = scale(datasets.load_iris().data)
hopkins_score_dict['Irises (test)'] = 1 - hopkins(df_iris, df_iris.shape[0])
# Penguins Dataset
df_penguins, y = load_penguins(return_X_y=True)
df_penguins = df_penguins[~df_penguins['bill_length_mm'].isna()]
hopkins_score_dict['Penguins'] = 1 - hopkins(df_penguins.values, df_penguins.shape[0])
# Random Dataset: uniform noise rescaled to each facet's observed range,
# serving as a "no cluster structure" baseline.
df = pd.DataFrame()
for col in df_facet_matrix_filter.columns.tolist():
    new_min = df_facet_matrix_filter[col].min()
    new_max = df_facet_matrix_filter[col].max()
    df[col] = [random() for _ in range(df_facet_matrix_filter.shape[0])]
    # Min-max rescale the generated noise onto the real column's range.
    old_min = df[col].min()
    old_max = df[col].max()
    df[col] = df[col].apply(lambda v: (v - old_min) / (old_max - old_min) * (new_max - new_min) + new_min)
# NOTE(review): unlike every other entry, this one is NOT inverted with
# "1 -". Presumably intentional, since a uniform dataset sits near 0.5
# either way — confirm before reusing.
hopkins_score_dict['Random (32k)'] = hopkins(df.values, df.shape[0])
df_hopkins_compare = pd.DataFrame.from_dict(hopkins_score_dict, orient='index').reset_index()
df_hopkins_compare.columns = ['Datasets', 'Hopkins Statistic']
# Extended Data Figure 6
sns.set_style("whitegrid")
# Highlight the two founder datasets; grey out the benchmark datasets.
highlighted = {'2-dimensions', 'Founders (32k)'}
cols = ['lightgreen' if name in highlighted else 'lightgray'
        for name in df_hopkins_compare['Datasets']]
ax = sns.barplot(x="Datasets", y="Hopkins Statistic", data=df_hopkins_compare, palette=cols)
ax.set(xlabel=None)
ax.set(ylabel=None)
ax.set(title = 'Hopkins Statistic')
sns.despine(left=True)
plt.xticks(rotation=45)
(array([0, 1, 2, 3, 4, 5, 6]), [Text(0, 0, 'Car Features'), Text(1, 0, 'Olympic Athletes'), Text(2, 0, '2-dimensions'), Text(3, 0, 'Founders (32k)'), Text(4, 0, 'Irises (test)'), Text(5, 0, 'Penguins'), Text(6, 0, 'Random (32k)')])
# Extended Data Figure 7
# Ward-linkage dendrogram over the t-SNE embedding (same construction as
# the earlier cluster-count exploration, repeated as an extended figure).
sns.set(font_scale = 1)
ward_linkage = sch.linkage(tsne, method="ward")
dendrogram = sch.dendrogram(ward_linkage)
plt.title('Dendrogram')
plt.xlabel('Founders')
plt.ylabel('Euclidean distances')
plt.show()
# Extended Data Figure 8
pio.templates.default = "plotly_white"
# (name, scores, color, dash) spec for each clustering-quality metric.
trace_specs = [
    ('Davies Bouldin Index', db_score_list, '#d62728', None),
    ('Silhouette Coefficient', sil_score_list, '#1f77b4', 'dash'),
    ('Calinski Harabas Index', ch_score_list, '#ff7f0e', 'dashdot'),
    ('Dunn Index', dunn_score_list, '#9467bd', 'dot'),
]
fig = go.Figure()
# Dashed guide marking the selected k = 6.
fig.add_vline(x=6, line_width=1, line_dash="dash", line_color="grey")
for label, values, color, dash in trace_specs:
    fig.add_trace(go.Scatter(x=num_cluster_range, y=scale(values),
                             mode='lines', name=label,
                             line=dict(color=color, width=2.5, dash=dash)))
fig.update_layout(
    xaxis_title='Number of Clusters',
    yaxis_title='Scaled Scores')
fig.show()
# Extended Data Figure 8
# Bar chart of cluster sizes, with ids remapped to the presentation order
# used throughout (0->0, 1->3, 2->5, 3->4, 4->1, 5->2; the temporary digit
# codes prevent the substitutions from colliding).
df_6clusters_copy = df_6clusters.copy()
df_6clusters_copy['Hierarchical_label'] = df_6clusters_copy['Hierarchical_label'].apply(lambda x: str(x).replace('1','6').replace('2','7').replace('3','8').replace('4','9').replace('5','-1').replace('9','1').replace('-1','2').replace('6','3').replace('8','4').replace('7','5'))
df_6clusters_copy['Hierarchical_label'] = df_6clusters_copy['Hierarchical_label'].astype(int)
df_counts = df_6clusters_copy['Hierarchical_label'].value_counts().to_frame().reset_index()
df_counts.columns = ['Cluster', 'Counts']
df_counts['Cluster'] = df_counts['Cluster'].apply(lambda x: str(int(x)))
sns.set_theme(style="white")
ax = sns.barplot(x="Cluster", y="Counts", data=df_counts.sort_values('Cluster'))
## This dataset is a very small subset of the original dataset due to privacy reasons
df_model_v1 = pd.read_csv("ExtendedDataFigure19_Subset.csv")
# Build the formula "success ~ x1 + x2 + ..." from columns 1..86.
col_name = df_model_v1.columns.values[1:87]
fmla = "success ~ {0}".format(' + '.join(col_name))
# Run the statistical model: binomial GLM (logistic regression).
model_v1 = sm.formula.glm(fmla,
                          family=sm.families.Binomial(), data=df_model_v1).fit()
print(model_v1.summary())
# Organize the results as a table by parsing the summary's coefficient table.
results_as_html = model_v1.summary().tables[1].as_html()
df_results = pd.read_html(results_as_html, header=0, index_col=0)[0]
# Keep significant coefficients only. .copy() detaches the slice so the
# rename below cannot raise SettingWithCopyWarning (the original used
# inplace rename on a boolean-mask slice).
df_results_plot = df_results[df_results['P>|z|'] < 0.05].sort_values('coef').copy()
df_results_plot = df_results_plot.rename(columns={'P>|z|': 'p-value'})
df_results_plot = df_results_plot.reset_index()
df_results_plot
# Tag each significant coefficient with the factor family implied by its
# name. Assignment order matters: a name matching several patterns keeps
# the LAST matching tag, exactly as in the original.
df_results_plot['Factors'] = np.nan
df_results_plot.loc[df_results_plot['index'].str.contains('ind_'), 'Factors'] = 'Industry'
df_results_plot.loc[df_results_plot['index'].str.contains('combo'), 'Factors'] = 'Combination'
df_results_plot.loc[df_results_plot['index'].str.contains('big5_'), 'Factors'] = 'big5'
df_results_plot.loc[df_results_plot['index'].str.contains('facet_'), 'Factors'] = 'Facets'
df_results_plot['Factors'] = df_results_plot['Factors'].fillna('Others')
# Stack the rows in a fixed presentation order of factor families.
factor_order = ['big5', 'Facets', 'Combination', 'Industry', 'Others']
df_results_plot = pd.concat([df_results_plot[df_results_plot['Factors'] == f]
                             for f in factor_order])
C:\Users\schoo\anaconda3\lib\site-packages\statsmodels\genmod\families\links.py:198: RuntimeWarning: overflow encountered in exp t = np.exp(-z) C:\Users\schoo\anaconda3\lib\site-packages\statsmodels\genmod\families\family.py:1056: RuntimeWarning: divide by zero encountered in log special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) + C:\Users\schoo\anaconda3\lib\site-packages\statsmodels\genmod\families\family.py:1056: RuntimeWarning: invalid value encountered in multiply special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: success No. Observations: 4152
Model: GLM Df Residuals: 3968
Model Family: Binomial Df Model: 183
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: nan
Date: Thu, 14 Mar 2024 Deviance: 1.3023e+05
Time: 12:18:57 Pearson chi2: 6.37e+18
No. Iterations: 100 Pseudo R-squ. (CS): nan
Covariance Type: nonrobust
===========================================================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------------------
Intercept -2.438e+15 2.71e+07 -8.99e+07 0.000 -2.44e+15 -2.44e+15
org_country[T.AUS] 3.036e+15 5.06e+07 6e+07 0.000 3.04e+15 3.04e+15
org_country[T.BGD] 24.1838 3.17e-07 7.64e+07 0.000 24.184 24.184
org_country[T.BRA] 3.935e+15 5.9e+07 6.67e+07 0.000 3.94e+15 3.94e+15
org_country[T.CAN] 3.608e+15 4.91e+07 7.35e+07 0.000 3.61e+15 3.61e+15
org_country[T.CHE] 4.586e+15 6.21e+07 7.39e+07 0.000 4.59e+15 4.59e+15
org_country[T.CHL] 8.478e+13 5.9e+07 1.44e+06 0.000 8.48e+13 8.48e+13
org_country[T.CHN] 8.936e+15 8.63e+07 1.04e+08 0.000 8.94e+15 8.94e+15
org_country[T.COL] 9.732e+14 8.57e+07 1.13e+07 0.000 9.73e+14 9.73e+14
org_country[T.CZE] -4.501e+14 8.29e+07 -5.43e+06 0.000 -4.5e+14 -4.5e+14
org_country[T.DEU] 3.819e+15 5.04e+07 7.58e+07 0.000 3.82e+15 3.82e+15
org_country[T.DNK] 6.694e+14 6.23e+07 1.08e+07 0.000 6.69e+14 6.69e+14
org_country[T.EGY] -1.713e+13 8.36e+07 -2.05e+05 0.000 -1.71e+13 -1.71e+13
org_country[T.ESP] 4.23e+15 4.87e+07 8.68e+07 0.000 4.23e+15 4.23e+15
org_country[T.FIN] 3.892e+15 5.56e+07 6.99e+07 0.000 3.89e+15 3.89e+15
org_country[T.FRA] 3.533e+15 5.01e+07 7.05e+07 0.000 3.53e+15 3.53e+15
org_country[T.GBR] 3.626e+15 4.85e+07 7.47e+07 0.000 3.63e+15 3.63e+15
org_country[T.HKG] -6.513e+13 6.81e+07 -9.56e+05 0.000 -6.51e+13 -6.51e+13
org_country[T.IDN] 2.491e+15 6.81e+07 3.66e+07 0.000 2.49e+15 2.49e+15
org_country[T.IND] 3.621e+15 4.86e+07 7.44e+07 0.000 3.62e+15 3.62e+15
org_country[T.IRL] 2.794e+15 5.19e+07 5.39e+07 0.000 2.79e+15 2.79e+15
org_country[T.ISR] 3.968e+15 5.17e+07 7.67e+07 0.000 3.97e+15 3.97e+15
org_country[T.ITA] -3.187e+14 5.41e+07 -5.89e+06 0.000 -3.19e+14 -3.19e+14
org_country[T.JPN] 4.126e+15 6.79e+07 6.08e+07 0.000 4.13e+15 4.13e+15
org_country[T.KOR] 1.881e+15 6.78e+07 2.77e+07 0.000 1.88e+15 1.88e+15
org_country[T.MEX] 4.389e+13 6.87e+07 6.39e+05 0.000 4.39e+13 4.39e+13
org_country[T.NGA] 1.035e+13 6.22e+07 1.66e+05 0.000 1.03e+13 1.03e+13
org_country[T.NLD] 3.249e+15 5.22e+07 6.22e+07 0.000 3.25e+15 3.25e+15
org_country[T.NOR] -3.522e+14 8.3e+07 -4.25e+06 0.000 -3.52e+14 -3.52e+14
org_country[T.NPL] -4.938e+14 8.31e+07 -5.94e+06 0.000 -4.94e+14 -4.94e+14
org_country[T.POL] 8.327e+15 8.31e+07 1e+08 0.000 8.33e+15 8.33e+15
org_country[T.PRT] -5.81e+14 8.3e+07 -7e+06 0.000 -5.81e+14 -5.81e+14
org_country[T.ROM] 2.789e+15 6.19e+07 4.5e+07 0.000 2.79e+15 2.79e+15
org_country[T.RUS] 2.177e+14 6.8e+07 3.2e+06 0.000 2.18e+14 2.18e+14
org_country[T.SGP] 4.076e+15 6.24e+07 6.53e+07 0.000 4.08e+15 4.08e+15
org_country[T.SWE] 3.678e+15 6.21e+07 5.93e+07 0.000 3.68e+15 3.68e+15
org_country[T.THA] 8.203e+15 8.37e+07 9.8e+07 0.000 8.2e+15 8.2e+15
org_country[T.TUR] -5.189e+14 5.92e+07 -8.77e+06 0.000 -5.19e+14 -5.19e+14
org_country[T.UKR] -6.511e+14 6.81e+07 -9.56e+06 0.000 -6.51e+14 -6.51e+14
org_country[T.USA] 3.798e+15 4.84e+07 7.85e+07 0.000 3.8e+15 3.8e+15
org_country[T.ZAF] -3.219e+14 6.21e+07 -5.18e+06 0.000 -3.22e+14 -3.22e+14
combo[T.Ac1De1] 1.219e+14 1.1e+07 1.1e+07 0.000 1.22e+14 1.22e+14
combo[T.Ac1De1Fi1] 4.299e+15 4.85e+07 8.86e+07 0.000 4.3e+15 4.3e+15
combo[T.Ac1De1Op1] -4.193e+15 6.81e+07 -6.15e+07 0.000 -4.19e+15 -4.19e+15
combo[T.Ac1De2] -4.396e+15 6.77e+07 -6.49e+07 0.000 -4.4e+15 -4.4e+15
combo[T.Ac1En1] -8.59e+14 1.77e+07 -4.85e+07 0.000 -8.59e+14 -8.59e+14
combo[T.Ac1En1De1] -5.013e+15 6.94e+07 -7.22e+07 0.000 -5.01e+15 -5.01e+15
combo[T.Ac1En1Fi1] -4.537e+15 6.85e+07 -6.63e+07 0.000 -4.54e+15 -4.54e+15
combo[T.Ac1En2] 3.982e+15 6.82e+07 5.84e+07 0.000 3.98e+15 3.98e+15
combo[T.Ac1Fi1] 9.658e+13 1.47e+07 6.59e+06 0.000 9.66e+13 9.66e+13
combo[T.Ac1Le1] -6.376e+14 1.4e+07 -4.57e+07 0.000 -6.38e+14 -6.38e+14
combo[T.Ac1Le1De1] -1.828e+15 3.94e+07 -4.64e+07 0.000 -1.83e+15 -1.83e+15
combo[T.Ac1Le2] -6.433e+14 4.82e+07 -1.33e+07 0.000 -6.43e+14 -6.43e+14
combo[T.Ac1Op1] 8.138e+14 1.56e+07 5.22e+07 0.000 8.14e+14 8.14e+14
combo[T.Ac1Op1Fi1] -4.298e+15 6.89e+07 -6.24e+07 0.000 -4.3e+15 -4.3e+15
combo[T.Ac2] 5.92e+14 1.09e+07 5.41e+07 0.000 5.92e+14 5.92e+14
combo[T.Ac2De1] -4.84e+15 3.93e+07 -1.23e+08 0.000 -4.84e+15 -4.84e+15
combo[T.Ac2En1] 6.526e+14 4.9e+07 1.33e+07 0.000 6.53e+14 6.53e+14
combo[T.Ac2Fi1] 4.284e+15 6.85e+07 6.25e+07 0.000 4.28e+15 4.28e+15
combo[T.Ac2Le1] -1.883e+15 4.82e+07 -3.91e+07 0.000 -1.88e+15 -1.88e+15
combo[T.Ac2Le1Fi1] 2.889e+15 6.93e+07 4.17e+07 0.000 2.89e+15 2.89e+15
combo[T.Ac2Op1] -9.186e+14 4.84e+07 -1.9e+07 0.000 -9.19e+14 -9.19e+14
combo[T.Ac3] -7.823e+14 4.84e+07 -1.62e+07 0.000 -7.82e+14 -7.82e+14
combo[T.De1] -1.457e+13 4.97e+06 -2.93e+06 0.000 -1.46e+13 -1.46e+13
combo[T.De1Fi1] -1.679e+14 1.09e+07 -1.53e+07 0.000 -1.68e+14 -1.68e+14
combo[T.De1Fi2] -1.007e+14 4.03e+07 -2.5e+06 0.000 -1.01e+14 -1.01e+14
combo[T.De1Op1] 1.905e+15 1.75e+07 1.09e+08 0.000 1.91e+15 1.91e+15
combo[T.De1Op1Fi1] -4.839e+15 4.91e+07 -9.85e+07 0.000 -4.84e+15 -4.84e+15
combo[T.De2] -2.381e+14 1.25e+07 -1.9e+07 0.000 -2.38e+14 -2.38e+14
combo[T.De2Fi1] 2.643e+15 3.96e+07 6.68e+07 0.000 2.64e+15 2.64e+15
combo[T.De2Op1] 4.29e+15 6.77e+07 6.33e+07 0.000 4.29e+15 4.29e+15
combo[T.En1] -8.336e+12 6.5e+06 -1.28e+06 0.000 -8.34e+12 -8.34e+12
combo[T.En1De1] 5.012e+13 1.22e+07 4.12e+06 0.000 5.01e+13 5.01e+13
combo[T.En1De1Fi1] -7.703e+14 2.91e+07 -2.65e+07 0.000 -7.7e+14 -7.7e+14
combo[T.En1De1Op1] -3.81e+14 3.97e+07 -9.59e+06 0.000 -3.81e+14 -3.81e+14
combo[T.En1De2] -3.599e+15 6.81e+07 -5.29e+07 0.000 -3.6e+15 -3.6e+15
combo[T.En1Fi1] 3.359e+14 1.57e+07 2.14e+07 0.000 3.36e+14 3.36e+14
combo[T.En1Fi2] -4.194e+15 6.84e+07 -6.14e+07 0.000 -4.19e+15 -4.19e+15
combo[T.En1Le1] -8.939e+13 1.4e+07 -6.4e+06 0.000 -8.94e+13 -8.94e+13
combo[T.En1Le1De1] 4.678e+15 6.92e+07 6.76e+07 0.000 4.68e+15 4.68e+15
combo[T.En1Op1] -5.349e+14 1.95e+07 -2.74e+07 0.000 -5.35e+14 -5.35e+14
combo[T.En2] 2.081e+13 1.43e+07 1.46e+06 0.000 2.08e+13 2.08e+13
combo[T.En2De1] -4.72e+15 6.82e+07 -6.92e+07 0.000 -4.72e+15 -4.72e+15
combo[T.En2Fi1] -2.183e+15 4.03e+07 -5.41e+07 0.000 -2.18e+15 -2.18e+15
combo[T.En2Le1] 5.344e+15 6.84e+07 7.81e+07 0.000 5.34e+15 5.34e+15
combo[T.En2Op1] -4.692e+15 6.8e+07 -6.9e+07 0.000 -4.69e+15 -4.69e+15
combo[T.Fi1] 1.206e+14 7.04e+06 1.71e+07 0.000 1.21e+14 1.21e+14
combo[T.Fi2] 7.79e+14 1.21e+07 6.46e+07 0.000 7.79e+14 7.79e+14
combo[T.Fi3] -1.259e+15 4.86e+07 -2.59e+07 0.000 -1.26e+15 -1.26e+15
combo[T.Le1] 1.234e+14 5.06e+06 2.44e+07 0.000 1.23e+14 1.23e+14
combo[T.Le1De1] -1.831e+15 1.4e+07 -1.31e+08 0.000 -1.83e+15 -1.83e+15
combo[T.Le1De1Fi1] 7.185e+14 4.03e+07 1.78e+07 0.000 7.19e+14 7.19e+14
combo[T.Le1Fi1] 1.935e+14 1.44e+07 1.35e+07 0.000 1.93e+14 1.93e+14
combo[T.Le1Fi2] -4.055e+15 6.87e+07 -5.9e+07 0.000 -4.06e+15 -4.06e+15
combo[T.Le1Fi3] 4.965e+15 6.93e+07 7.17e+07 0.000 4.97e+15 4.97e+15
combo[T.Le1Op1] -7.439e+14 2.19e+07 -3.4e+07 0.000 -7.44e+14 -7.44e+14
combo[T.Le2] -3.398e+14 1.53e+07 -2.22e+07 0.000 -3.4e+14 -3.4e+14
combo[T.Op1] -2.847e+13 5.68e+06 -5.02e+06 0.000 -2.85e+13 -2.85e+13
combo[T.Op1Fi1] 2.194e+14 1.52e+07 1.45e+07 0.000 2.19e+14 2.19e+14
combo[T.Op1Fi2] -4.723e+15 6.81e+07 -6.94e+07 0.000 -4.72e+15 -4.72e+15
combo[T.Op2] -2.421e+14 1.73e+07 -1.4e+07 0.000 -2.42e+14 -2.42e+14
combo[T.Op2Fi1] -4.306e+15 7.16e+07 -6.02e+07 0.000 -4.31e+15 -4.31e+15
olderthansix -2.438e+15 2.71e+07 -8.99e+07 0.000 -2.44e+15 -2.44e+15
Female -4.659e+14 4.16e+06 -1.12e+08 0.000 -4.66e+14 -4.66e+14
org_numfounders 1.398e+14 1.53e+06 9.15e+07 0.000 1.4e+14 1.4e+14
ind_Administrative_Services -2.296e+14 8.29e+06 -2.77e+07 0.000 -2.3e+14 -2.3e+14
ind_Advertising -2.608e+13 6.47e+06 -4.03e+06 0.000 -2.61e+13 -2.61e+13
ind_Agriculture_and_Farming -4.057e+15 3.91e+07 -1.04e+08 0.000 -4.06e+15 -4.06e+15
ind_Apps -1.953e+14 5.72e+06 -3.41e+07 0.000 -1.95e+14 -1.95e+14
ind_Artificial_Intelligence -9.234e+13 1.08e+07 -8.54e+06 0.000 -9.23e+13 -9.23e+13
ind_Biotechnology 3.933e+14 1.18e+07 3.34e+07 0.000 3.93e+14 3.93e+14
ind_Clothing_and_Apparel 1.932e+14 1.28e+07 1.51e+07 0.000 1.93e+14 1.93e+14
ind_Commerce_and_Shopping 3.221e+14 4.23e+06 7.62e+07 0.000 3.22e+14 3.22e+14
ind_Community_and_Lifestyle 1.256e+14 6.59e+06 1.91e+07 0.000 1.26e+14 1.26e+14
ind_Consumer_Electronics -4.886e+14 1.08e+07 -4.54e+07 0.000 -4.89e+14 -4.89e+14
ind_Consumer_Goods 6.959e+14 1.21e+07 5.73e+07 0.000 6.96e+14 6.96e+14
ind_Content_and_Publishing 4.298e+14 6.52e+06 6.59e+07 0.000 4.3e+14 4.3e+14
ind_Data_and_Analytics 2.306e+14 5.37e+06 4.3e+07 0.000 2.31e+14 2.31e+14
ind_Design -3.687e+14 8.27e+06 -4.46e+07 0.000 -3.69e+14 -3.69e+14
ind_Education -5.246e+12 4.78e+06 -1.1e+06 0.000 -5.25e+12 -5.25e+12
ind_Energy -4.701e+15 7.73e+07 -6.08e+07 0.000 -4.7e+15 -4.7e+15
ind_Events 3.813e+14 1.4e+07 2.72e+07 0.000 3.81e+14 3.81e+14
ind_Financial_Services 1.784e+13 5.29e+06 3.37e+06 0.000 1.78e+13 1.78e+13
ind_Food_and_Beverage 1.513e+14 8.39e+06 1.8e+07 0.000 1.51e+14 1.51e+14
ind_Gaming 5.151e+14 1.07e+07 4.8e+07 0.000 5.15e+14 5.15e+14
ind_Government_and_Military -5.076e+14 2.57e+07 -1.97e+07 0.000 -5.08e+14 -5.08e+14
ind_Hardware 3.49e+14 6.14e+06 5.68e+07 0.000 3.49e+14 3.49e+14
ind_Health_Care 2.069e+14 4.89e+06 4.23e+07 0.000 2.07e+14 2.07e+14
ind_Information_Technology 3.14e+13 3.44e+06 9.13e+06 0.000 3.14e+13 3.14e+13
ind_Internet_Services 1.041e+14 2.97e+06 3.51e+07 0.000 1.04e+14 1.04e+14
ind_Lending_and_Investments 4.676e+13 9.35e+06 5e+06 0.000 4.68e+13 4.68e+13
ind_Manufacturing -8.783e+14 1.87e+07 -4.7e+07 0.000 -8.78e+14 -8.78e+14
ind_Media_and_Entertainment -1.178e+14 5.02e+06 -2.35e+07 0.000 -1.18e+14 -1.18e+14
ind_Messaging_and_Telecommunications 4.145e+14 9.87e+06 4.2e+07 0.000 4.14e+14 4.14e+14
ind_Mobile 2.744e+14 5.08e+06 5.41e+07 0.000 2.74e+14 2.74e+14
ind_Music_and_Audio 4.75e+14 9.08e+06 5.23e+07 0.000 4.75e+14 4.75e+14
ind_Natural_Resources 4.378e+15 7.19e+07 6.09e+07 0.000 4.38e+15 4.38e+15
ind_Navigation_and_Mapping -1.232e+15 2.63e+07 -4.69e+07 0.000 -1.23e+15 -1.23e+15
ind_Other -2.438e+14 3.98e+06 -6.13e+07 0.000 -2.44e+14 -2.44e+14
ind_Payments 3.352e+14 9.23e+06 3.63e+07 0.000 3.35e+14 3.35e+14
ind_Platforms -4.84e+14 9.97e+06 -4.85e+07 0.000 -4.84e+14 -4.84e+14
ind_Privacy_and_Security -1.772e+15 6.39e+06 -2.77e+08 0.000 -1.77e+15 -1.77e+15
ind_Professional_Services -1.445e+14 4.33e+06 -3.34e+07 0.000 -1.44e+14 -1.44e+14
ind_Real_Estate 4.78e+13 6.96e+06 6.87e+06 0.000 4.78e+13 4.78e+13
ind_Sales_and_Marketing 6.555e+14 5.19e+06 1.26e+08 0.000 6.56e+14 6.56e+14
ind_Science_and_Engineering -2.546e+14 9.86e+06 -2.58e+07 0.000 -2.55e+14 -2.55e+14
ind_Software 3.499e+14 2.93e+06 1.19e+08 0.000 3.5e+14 3.5e+14
ind_Sports -8.371e+13 7.81e+06 -1.07e+07 0.000 -8.37e+13 -8.37e+13
ind_Sustainability 4.078e+14 3.47e+07 1.17e+07 0.000 4.08e+14 4.08e+14
ind_Transportation 1.801e+14 8.49e+06 2.12e+07 0.000 1.8e+14 1.8e+14
ind_Travel_and_Tourism 1.813e+14 8.42e+06 2.15e+07 0.000 1.81e+14 1.81e+14
ind_Video 3.871e+14 8.11e+06 4.77e+07 0.000 3.87e+14 3.87e+14
big5_max_open -1.938e+14 1.26e+07 -1.54e+07 0.000 -1.94e+14 -1.94e+14
big5_max_conscient 2.373e+14 1.14e+07 2.08e+07 0.000 2.37e+14 2.37e+14
big5_max_extra 8.295e+13 9.66e+06 8.59e+06 0.000 8.29e+13 8.29e+13
big5_max_agree 2.838e+14 1.03e+07 2.75e+07 0.000 2.84e+14 2.84e+14
big5_max_neuro 2.197e+13 7.24e+06 3.03e+06 0.000 2.2e+13 2.2e+13
Openness_facet_adventurousness_percentile -1.634e+13 1.31e+07 -1.25e+06 0.000 -1.63e+13 -1.63e+13
Openness_facet_artistic_interests_percentile 1.647e+13 1.35e+07 1.22e+06 0.000 1.65e+13 1.65e+13
Openness_facet_emotionality_percentile -2.286e+14 1.4e+07 -1.64e+07 0.000 -2.29e+14 -2.29e+14
Openness_facet_imagination_percentile -2.959e+14 1.48e+07 -2e+07 0.000 -2.96e+14 -2.96e+14
Openness_facet_intellect_percentile 5.425e+12 1.37e+07 3.96e+05 0.000 5.42e+12 5.42e+12
Openness_facet_liberalism_percentile 5.331e+14 1.34e+07 3.97e+07 0.000 5.33e+14 5.33e+14
Conscientiousness_facet_achievement_striving_percentile 9.033e+14 2.24e+07 4.04e+07 0.000 9.03e+14 9.03e+14
Conscientiousness_facet_cautiousness_percentile -4.928e+13 1.51e+07 -3.27e+06 0.000 -4.93e+13 -4.93e+13
Conscientiousness_facet_dutifulness_percentile 3.585e+14 1.3e+07 2.76e+07 0.000 3.59e+14 3.59e+14
Conscientiousness_facet_orderliness_percentile 3.518e+14 1.27e+07 2.77e+07 0.000 3.52e+14 3.52e+14
Conscientiousness_facet_self_discipline_percentile -2.151e+14 1.74e+07 -1.24e+07 0.000 -2.15e+14 -2.15e+14
Conscientiousness_facet_self_efficacy_percentile 1.552e+14 1.85e+07 8.37e+06 0.000 1.55e+14 1.55e+14
Extraversion_facet_activity_level_percentile -4.762e+14 1.85e+07 -2.58e+07 0.000 -4.76e+14 -4.76e+14
Extraversion_facet_assertiveness_percentile -4.253e+14 1.92e+07 -2.22e+07 0.000 -4.25e+14 -4.25e+14
Extraversion_facet_cheerfulness_percentile 1.039e+15 1.39e+07 7.48e+07 0.000 1.04e+15 1.04e+15
Extraversion_facet_excitement_seeking_percentile -6.25e+14 1.52e+07 -4.12e+07 0.000 -6.25e+14 -6.25e+14
Extraversion_facet_friendliness_percentile -4.842e+14 1.63e+07 -2.96e+07 0.000 -4.84e+14 -4.84e+14
Extraversion_facet_gregariousness_percentile -2.437e+14 1.48e+07 -1.65e+07 0.000 -2.44e+14 -2.44e+14
Agreeableness_facet_altruism_percentile -1.505e+13 1.64e+07 -9.19e+05 0.000 -1.51e+13 -1.51e+13
Agreeableness_facet_cooperation_percentile -2.434e+14 1.98e+07 -1.23e+07 0.000 -2.43e+14 -2.43e+14
Agreeableness_facet_modesty_percentile -8.832e+14 1.36e+07 -6.47e+07 0.000 -8.83e+14 -8.83e+14
Agreeableness_facet_morality_percentile -5.459e+14 1.42e+07 -3.84e+07 0.000 -5.46e+14 -5.46e+14
Agreeableness_facet_sympathy_percentile 5.944e+14 1.2e+07 4.94e+07 0.000 5.94e+14 5.94e+14
Agreeableness_facet_trust_percentile 5.312e+14 1.84e+07 2.88e+07 0.000 5.31e+14 5.31e+14
Emotional_range_facet_anger_percentile -4.276e+14 1.47e+07 -2.91e+07 0.000 -4.28e+14 -4.28e+14
Emotional_range_facet_anxiety_percentile -4.447e+13 1.84e+07 -2.41e+06 0.000 -4.45e+13 -4.45e+13
Emotional_range_facet_depression_percentile 6.73e+14 1.82e+07 3.69e+07 0.000 6.73e+14 6.73e+14
Emotional_range_facet_immoderation_percentile -3.093e+14 1.17e+07 -2.65e+07 0.000 -3.09e+14 -3.09e+14
Emotional_range_facet_self_consciousness_percentile -1.841e+14 1.63e+07 -1.13e+07 0.000 -1.84e+14 -1.84e+14
===========================================================================================================================
## Extended Data Figure 19
# Colors for the five factor groups, drawn from seaborn's named palettes.
# Palettes are fetched once and indexed, rather than re-built per element.
paired_hex = sns.color_palette("Paired").as_hex()
greens_hex = sns.color_palette("Greens").as_hex()
color_list = [paired_hex[1], paired_hex[5], greens_hex[4], paired_hex[7], paired_hex[9]]
# Bar chart of the OLS coefficient estimates, one bar per regressor,
# colored by factor group, with symmetric grey error bars taken from
# the "std err" column of the results frame.
fig = px.bar(
    df_results_plot,
    x="index",
    y="coef",
    color='Factors',
    color_discrete_sequence=color_list,
    template="plotly_white",
    opacity=0.6,
    width=1500,
    height=900,
)
fig.update_traces(
    error_y=dict(
        type="data",
        symmetric=True,
        color='grey',
        array=df_results_plot["std err"],
    )
)
# Title (with model specification as a subtitle) and tight margins in one pass.
fig.update_layout(
    title=dict(
        text="Coefficient Estimates of Startups Success Model <br><sup>Specification: Success ~ Basics + Country + Personal level + Multi-Founder + Industry + Personality Combo</sup>",
        font=dict(size=24, color="#000000"),
    ),
    margin=dict(r=2, b=50, pad=20),
)
# Axis titles are dropped; x tick labels are slanted for readability.
fig.update_yaxes(title=None)
fig.update_xaxes(title=None, tickangle=-45, tickfont=dict(color='black', size=14))
# fig.update_layout(yaxis_visible=False)